# import relevant modules
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
import plotly.express as px
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
import plotly.graph_objects as go
import glob
import sys
sys.path.append('../scripts/')
from analysis import get_correlation
from scipy import stats
# Restrict the analysis to hashtags that received an LDA topic assignment.
topics_df = pd.read_json('../../data/BTW17_Twitter/lda/hashtag_topics.json')
hashtags = list(topics_df['hashtag'])
# Daily hashtag count timeseries from the BTW17 Twitter corpus.
hashtag_df = pd.read_json('../../data/BTW17_Twitter/hashtags/hashtag_counts.json')
hashtag_df.head(3)
# Load politicians' metadata and keep only the columns used downstream
# (query term, party, gender); names are lower-cased to match query terms.
persons_df = (
    pd.read_csv('../../data/BTW17_Suggestions/btw_politicians_demographic.csv')
    .drop(columns=['Unnamed: 0', 'Born', 'Bundesland', 'Age'])
    .rename(columns={'Name': 'queryterm', 'Party': 'party', 'Gender': 'gender'})
)
persons_df['queryterm'] = persons_df['queryterm'].apply(str.lower)
persons_df.head(3)
# load suggestions timeseries
tmp = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
# Truncate timestamps to calendar dates so rows aggregate per day.
tmp['date'] = pd.to_datetime(tmp['date']).dt.date
suggestions_df = pd.DataFrame()
# Count rows per (date, queryterm, suggestion) triple.
# NOTE(review): this assumes the grouped count result has exactly four
# columns in this order — verify against the parquet file's schema.
suggestions_df[['date', 'queryterm', 'suggestion', 'count']] = tmp.groupby(['date', 'queryterm', 'suggestion'], as_index=False).count()
# Attach party/gender metadata; query terms without a matching politician keep NaN.
suggestions_df = suggestions_df.merge(persons_df, how='left', on='queryterm')
# load vector similarities
similarity_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/vector_similarity.json')
# Attach the full hashtag list to every row (same list object repeated);
# it is exploded into one row per hashtag further below.
similarity_df['hashtags'] = [hashtags for i in similarity_df.index]
# Suggestions are stored as token lists; join to a single string so it
# can serve as a merge key.
similarity_df['suggestion'] = similarity_df['suggestion'].apply(lambda x: ' '.join(x))
# join suggestion cluster and group again
suggestions_df = suggestions_df.merge(similarity_df, how='inner', on='suggestion')
# NOTE(review): sum('count') passes 'count' positionally as `numeric_only`
# (truthy), not as a column selector — confirm only the intended numeric
# columns are summed here.
suggestions_df = suggestions_df.groupby(['date', 'queryterm', 'party', 'gender', 'cluster'], as_index=False).sum('count')
suggestions_df.head(3)
# remodel similarity cluster to hashtags: explode the per-row lists so each
# (suggestion, cluster) pair yields one row per hashtag/score entry
similarity_df = similarity_df.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index()
# Exploded values come back as objects; coerce scores for numeric aggregation.
similarity_df['similarity_scores'] = pd.to_numeric(similarity_df['similarity_scores'])
# Average the similarity score of each hashtag within each cluster.
# NOTE(review): mean('similarity_scores') passes the string positionally as
# `numeric_only` (truthy), not as a column selector — confirm intended columns.
similarity_df = similarity_df.groupby(['cluster', 'hashtags'], as_index=False).mean('similarity_scores')
similarity_df.head(3)
# keep only cluster/hashtag pairs with similarity score >= 0.4
# (comment previously said 0.5, which contradicted the code below)
sim_df = similarity_df[similarity_df['similarity_scores']>=0.4].reset_index(drop=True)
# Aggregate daily suggestion counts at three granularities: per cluster,
# per cluster x party, and per cluster x gender.
cluster_df = (suggestions_df.groupby(['date', 'cluster'], as_index=False)
              .sum('count')
              .rename(columns={'count': 'cluster_count'}))
cluster_party_df = (suggestions_df.groupby(['date', 'party', 'cluster'], as_index=False)
                    .sum('count')
                    .rename(columns={'count': 'cluster_count'}))
cluster_gender_df = (suggestions_df.groupby(['date', 'gender', 'cluster'], as_index=False)
                     .sum('count')
                     .rename(columns={'count': 'cluster_count'}))
# Distinguish raw hashtag counts from cluster counts in later merges.
hashtag_df = hashtag_df.rename(columns={'count': 'hashtag_count'})
# Delays (in days) at which to compute lagged correlations: 0, 3, ..., 60.
# range() replaces the previous manual append loop (same values).
delays = list(range(0, 61, 3))
%load_ext autoreload
%autoreload 2
from analysis import get_correlation
# One correlation table per delay; a comprehension replaces the previous
# append loop (same order, same calls).
dfs = [
    get_correlation(delay, hashtag_df, cluster_df, cluster_gender_df,
                    cluster_party_df, sim_df)
    for delay in delays
]
## set to *.json to load all
#input_loc = '../../data/Analysis/*.json'
#input_files = glob.glob(input_loc)
#
#dfs = []
#for file in input_files:
# data = pd.read_json(file)
# dfs.append(data)
# Persist each delay's correlation table for later reloading
# (zip replaces the previous range(len(...)) index loop).
for delay, df in zip(delays, dfs):
    df.to_json(f'../../data/Analysis/df_{delay}_delays.json')
# Need >= 21 colors (one per delay); repeat the 11-color Antique palette.
# Copy instead of aliasing: the previous `colors = px.colors.qualitative.Antique`
# followed by `colors.extend(...)` mutated Plotly's shared module-level palette
# list, doubling it for every later use of px.colors.qualitative.Antique.
colors = list(px.colors.qualitative.Antique) * 2
# Scatter plot of the high performers: only pairs with correlation >= 0.5,
# one trace (and color) per delay.
fig = go.Figure()
for i, df in enumerate(dfs):
    strong = df[df['pearsonr'] >= 0.5]
    fig.add_trace(go.Scatter(x=strong['pearsonr'],
                             y=strong['similarity_scores'],
                             name=delays[i], mode='markers',
                             marker=dict(color=colors[i])))
fig.update_layout(template='simple_white',
                  font=dict(family='Computer Modern', color='black', size=15))
fig.update_yaxes(title_text='Similarity Score')
fig.update_xaxes(title_text='Korrelation')
fig.show()
# Mean correlation per delay across all rows of each table
# (comprehension replaces the previous append loop).
mean = [df['pearsonr'].mean() for df in dfs]
fig = px.line(x=delays, y=mean, labels={'x':'Delay', 'y':'Mittlere Korrelation'},
              template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
# Mean correlation per delay, broken down by party.
plot_dict = {'Delay': [], 'Partei': [], 'Mittlere Korrelation': []}
for delay, df in zip(delays, dfs):
    for party in set(df['party']):
        plot_dict['Delay'].append(delay)
        plot_dict['Partei'].append(party)
        plot_dict['Mittlere Korrelation'].append(
            df.loc[df['party'] == party, 'pearsonr'].mean())
fig = px.line(plot_dict, x='Delay', y='Mittlere Korrelation',
              template='simple_white', color='Partei',
              color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
# Mean correlation per delay, broken down by gender.
plot_dict = {'Delay': [], 'Gender': [], 'Mittlere Korrelation': []}
for delay, df in zip(delays, dfs):
    for gender in set(df['gender']):
        plot_dict['Delay'].append(delay)
        plot_dict['Gender'].append(gender)
        plot_dict['Mittlere Korrelation'].append(
            df.loc[df['gender'] == gender, 'pearsonr'].mean())
fig = px.line(plot_dict, x='Delay', y='Mittlere Korrelation',
              template='simple_white', color='Gender',
              color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
# Mean correlation per delay, broken down by suggestion cluster.
plot_dict = {'Delay': [], 'Cluster': [], 'Mittlere Korrelation': []}
for delay, df in zip(delays, dfs):
    for cluster in set(df['cluster']):
        plot_dict['Delay'].append(delay)
        plot_dict['Cluster'].append(cluster)
        plot_dict['Mittlere Korrelation'].append(
            df.loc[df['cluster'] == cluster, 'pearsonr'].mean())
fig = px.line(plot_dict, x='Delay', y='Mittlere Korrelation',
              template='simple_white', color='Cluster',
              color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
# Mean correlation per delay, broken down by hashtag.
plot_dict = {'Delay': [], 'Hashtag': [], 'Mittlere Korrelation': []}
for delay, df in zip(delays, dfs):
    for hashtag in set(df['hashtags']):
        plot_dict['Delay'].append(delay)
        plot_dict['Hashtag'].append(hashtag)
        plot_dict['Mittlere Korrelation'].append(
            df.loc[df['hashtags'] == hashtag, 'pearsonr'].mean())
fig = px.line(plot_dict, x='Delay', y='Mittlere Korrelation',
              template='simple_white', color='Hashtag',
              color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()